Library imports

#install.packages("dplyr")
#install.packages("lubridate")
#install.packages("ggplot2")
#install.packages("corrplot")
#install.packages("PerformanceAnalytics")
#install.packages("randomForest")
#install.packages("caret")
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.3
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
library(PerformanceAnalytics)
## Warning: package 'PerformanceAnalytics' was built under R version 4.4.3
## Loading required package: xts
## Warning: package 'xts' was built under R version 4.4.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.4.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
library(cluster)

Data processing

# Data is taken from https://www.kaggle.com/datasets/ethon0426/lending-club-20072020q1
data <- read.csv("Loan_status_2007-2020Q3.gzip")
head(data)

# Analysis window: loans issued during calendar year 2019.
start_date <- "2019-01-01"
end_date <- "2019-12-31"

# issue_d comes as "Mon-YYYY" strings; parse them into proper dates.
data <- data %>%
  mutate(issue_date_parsed = parse_date_time(issue_d, orders = "b-Y"))

# Keep only the 2019 vintage, then free the full dataset.
df_2019 <- data %>%
  filter(issue_date_parsed >= ymd(start_date),
         issue_date_parsed <= ymd(end_date))
rm(data)
head(df_2019)

Removal of columns where the share of NA values exceeds the threshold

# Drop columns whose share of missing values exceeds 50%.
threshold <- 0.5
missing_pct <- colMeans(is.na(df_2019))
df_clean_2019 <- df_2019[missing_pct <= threshold]
df_clean_2019

Saving to csv

# Persist the cleaned 2019 slice so later sessions can skip the raw load.
write.csv(x = df_clean_2019, file = "df_clean_2019.csv", row.names = FALSE)

Turning some columns to numeric

Loan status

# Encode the loan outcome as a number:
#   1 = repaid, 0 = still active / in grace period, -1 = late or defaulted.
# `.default` makes the NA fallback for any other status explicit; the original
# ended with a trailing comma and relied on case_when()'s implicit default.
df_clean_2019$loan_status_num <- dplyr::case_when(
  df_clean_2019$loan_status %in% c("Fully Paid") ~ 1,
  df_clean_2019$loan_status %in% c("Current", "In Grace Period") ~ 0,
  df_clean_2019$loan_status %in% c("Late (31-120 days)", "Late (16-30 days)", "Charged Off", "Default") ~ -1,
  .default = NA_real_
)

Fico score creation

# FICO score: midpoint of the reported low/high range.
df_clean_2019$fico_score <- (df_clean_2019$fico_range_high + df_clean_2019$fico_range_low) / 2

Selecting of important variables

# Features kept for modelling; loan_status_num is the target.
cols_to_include <- c('loan_status_num', 'annual_inc', 'dti', 'loan_amnt',
                     'fico_score', 'emp_title', 'emp_length', 'home_ownership',
                     'verification_status', 'title', 'issue_date_parsed',
                     'addr_state', 'term'#,
                     # Tried these ones, but they didn't add value to the performance of models
                     #'acc_now_delinq', 'earliest_cr_line', 'open_acc',
                     #'pub_rec', 'revol_bal', 'mort_acc', 'pub_rec_bankruptcies'
                     )

# Restrict the data frame to the selected columns (original column order kept).
keep <- colnames(df_clean_2019) %in% cols_to_include
df_clean_2019 <- df_clean_2019[keep]
df_clean_2019

# Quick sanity summary of the numeric columns.
cols_numeric <- sapply(df_clean_2019, is.numeric)
summary(df_clean_2019[, cols_numeric])
##    loan_amnt       annual_inc           dti         loan_status_num  
##  Min.   : 1000   Min.   :      0   Min.   :  0.00   Min.   :-1.0000  
##  1st Qu.: 8500   1st Qu.:  49800   1st Qu.: 12.57   1st Qu.: 0.0000  
##  Median :14400   Median :  70000   Median : 18.86   Median : 0.0000  
##  Mean   :16470   Mean   :  85471   Mean   : 20.76   Mean   : 0.0792  
##  3rd Qu.:23000   3rd Qu.: 100000   3rd Qu.: 26.13   3rd Qu.: 0.0000  
##  Max.   :40000   Max.   :9682505   Max.   :999.00   Max.   : 1.0000  
##                                    NA's   :1145                      
##    fico_score   
##  Min.   :662.0  
##  1st Qu.:682.0  
##  Median :702.0  
##  Mean   :708.5  
##  3rd Qu.:727.0  
##  Max.   :847.5  
## 

Tables of important variables

Loan Status

# Relative and absolute frequencies of the encoded loan outcome.
# FIX: the raw loan_status column was dropped during feature selection above,
# so `df_clean_2019$loan_status` only resolved via `$` partial matching
# against loan_status_num (a tibble would have returned NULL). Reference the
# encoded column explicitly.
table(df_clean_2019$loan_status_num) / length(df_clean_2019$loan_status_num)
## 
##         -1          0          1 
## 0.04016159 0.84048083 0.11935758
table(df_clean_2019$loan_status_num)
## 
##     -1      0      1 
##  20808 435459  61840

Employment length

# Relative and absolute frequencies of employment length.
# NOTE(review): the first, unlabelled category in the output is the empty
# string "" (no reported employment length); those rows are dropped after
# the numeric mapping further below.
table(df_clean_2019$emp_length) / length(df_clean_2019$emp_length)
## 
##              < 1 year     1 year  10+ years    2 years    3 years    4 years 
## 0.08926921 0.12527335 0.06852060 0.29850977 0.08664040 0.07741258 0.05931014 
##    5 years    6 years    7 years    8 years    9 years 
## 0.06367990 0.04167865 0.03470519 0.03095693 0.02404330
table(df_clean_2019$emp_length)
## 
##            < 1 year    1 year 10+ years   2 years   3 years   4 years   5 years 
##     46251     64905     35501    154660     44889     40108     30729     32993 
##   6 years   7 years   8 years   9 years 
##     21594     17981     16039     12457

Turning employment length into a numeric variable

# Map the employment-length labels onto years of experience;
# "< 1 year" is coded as 0.5 and "10+ years" is capped at 10.
emp_map <- c("< 1 year" = 0.5,
             "1 year"   = 1,
             "2 years"  = 2,
             "3 years"  = 3,
             "4 years"  = 4,
             "5 years"  = 5,
             "6 years"  = 6,
             "7 years"  = 7,
             "8 years"  = 8,
             "9 years"  = 9,
             "10+ years"= 10)

# Named-vector lookup; labels absent from the map (e.g. "") become NA.
df_clean_2019$emp_length_num <- as.numeric(emp_map[df_clean_2019$emp_length])

# Drop rows without a usable employment length.
df_clean_2019 <- subset(df_clean_2019, !is.na(emp_length_num))

tab <- table(df_clean_2019$emp_length_num)
barplot(tab, las = 2, col = "skyblue", main = "Employment Years Distribution")

Employment title

There are too many distinct titles, so this variable is not included in the model

# Count occurrences of each employment title and show the ten most common
# ones, ignoring titles that appear only once. (The blank label is "".)
temp <- table(df_clean_2019$emp_title)
temp <- sort(temp[temp > 1], decreasing = TRUE)
head(temp, 10)
## 
##                           Teacher          Manager           Driver 
##            31360             8857             7373             3839 
## Registered Nurse       Supervisor               RN  Project Manager 
##             3741             2890             2825             2372 
##            Sales   Office Manager 
##             2363             2355

Home ownership

# Home-ownership distribution: shares and raw counts.
# ANY (2070 rows) and NONE (1 row) are rare; their dummy columns are dropped
# after one-hot encoding below.
table(df_clean_2019$home_ownership) / length(df_clean_2019$home_ownership)
## 
##          ANY     MORTGAGE         NONE          OWN         RENT 
## 4.386932e-03 4.948438e-01 2.119291e-06 1.076706e-01 3.930966e-01
table(df_clean_2019$home_ownership)
## 
##      ANY MORTGAGE     NONE      OWN     RENT 
##     2070   233495        1    50805   185485

Create dummies for these categories

# One-hot encode home_ownership; "- 1" removes the intercept so every
# level gets its own indicator column.
df_clean_2019$home_ownership <- factor(df_clean_2019$home_ownership)
dummies <- model.matrix(~ home_ownership - 1, data = df_clean_2019)
df_clean_2019 <- cbind(df_clean_2019, as.data.frame(dummies))

# Drop the ANY/NONE indicators: those levels have almost no observations.
drop_cols <- c("home_ownershipANY", "home_ownershipNONE")
df_clean_2019 <- df_clean_2019[, !(names(df_clean_2019) %in% drop_cols)]

Term

# Loan-term distribution (36 vs 60 months): shares and raw counts.
table(df_clean_2019$term) / length(df_clean_2019$term)
## 
##  36 months  60 months 
##   0.669645   0.330355
table(df_clean_2019$term)
## 
##  36 months  60 months 
##     315976     155880

Turning into dummies

# One-hot encode the loan term (intercept suppressed so both levels appear).
df_clean_2019$term <- factor(df_clean_2019$term)
dummies <- model.matrix(~ term - 1, data = df_clean_2019)
df_clean_2019 <- cbind(df_clean_2019, as.data.frame(dummies))

Verification status

# Income verification status: shares and raw counts.
table(df_clean_2019$verification_status) / length(df_clean_2019$verification_status)
## 
##    Not Verified Source Verified        Verified 
##       0.4432496       0.4201854       0.1365650
table(df_clean_2019$verification_status)
## 
##    Not Verified Source Verified        Verified 
##          209150          198267           64439

Turning into dummies

# One-hot encode income verification status (no intercept column).
df_clean_2019$verification_status <- factor(df_clean_2019$verification_status)
dummies <- model.matrix(~ verification_status - 1, data = df_clean_2019)
df_clean_2019 <- cbind(df_clean_2019, as.data.frame(dummies))

Address

# Share of loans per state, largest first (TRUE is the `decreasing` argument).
sort(table(df_clean_2019$addr_state) / length(df_clean_2019$addr_state), TRUE)
## 
##          CA          TX          NY          FL          IL          NJ 
## 0.137118104 0.086507324 0.076658981 0.074393459 0.039520532 0.036595911 
##          GA          OH          PA          VA          NC          MI 
## 0.033404259 0.032177190 0.031560476 0.027084534 0.026914991 0.024147197 
##          AZ          MD          MA          WA          CO          MN 
## 0.024079380 0.023918314 0.023909837 0.021400597 0.020565596 0.017017904 
##          TN          IN          CT          MO          NV          WI 
## 0.016979757 0.016697891 0.016587688 0.015680631 0.014237394 0.013171391 
##          SC          OR          AL          LA          KY          OK 
## 0.012845021 0.012637330 0.010876200 0.010537113 0.009452036 0.009070564 
##          KS          AR          UT          MS          WV          NH 
## 0.008199535 0.007536197 0.006936438 0.006224357 0.005650029 0.005171069 
##          NM          NE          RI          HI          ID          ME 
## 0.005010003 0.004920993 0.004704825 0.004418721 0.003545573 0.003346360 
##          DE          MT          VT          ND          AK          DC 
## 0.002873758 0.002818657 0.002401156 0.002223136 0.002163796 0.002079024 
##          SD          WY 
## 0.002057831 0.001970940
# Raw loan counts per state, sorted descending.
sort(table(df_clean_2019$addr_state), TRUE)
## 
##    CA    TX    NY    FL    IL    NJ    GA    OH    PA    VA    NC    MI    AZ 
## 64700 40819 36172 35103 18648 17268 15762 15183 14892 12780 12700 11394 11362 
##    MD    MA    WA    CO    MN    TN    IN    CT    MO    NV    WI    SC    OR 
## 11286 11282 10098  9704  8030  8012  7879  7827  7399  6718  6215  6061  5963 
##    AL    LA    KY    OK    KS    AR    UT    MS    WV    NH    NM    NE    RI 
##  5132  4972  4460  4280  3869  3556  3273  2937  2666  2440  2364  2322  2220 
##    HI    ID    ME    DE    MT    VT    ND    AK    DC    SD    WY 
##  2085  1673  1579  1356  1330  1133  1049  1021   981   971   930

Turning into dummies

# One-hot encode the borrower's state (one indicator per state, no intercept).
df_clean_2019$addr_state <- factor(df_clean_2019$addr_state)
dummies <- model.matrix(~ addr_state - 1, data = df_clean_2019)
df_clean_2019 <- cbind(df_clean_2019, as.data.frame(dummies))

Title

# Loan-title (purpose) shares, sorted descending.
sort(table(df_clean_2019$title) / length(df_clean_2019$title), TRUE)
## 
##      Debt consolidation Credit card refinancing        Home improvement 
##            0.5439901156            0.2723373233            0.0603679936 
##                   Other          Major purchase        Medical expenses 
##            0.0561908718            0.0178274728            0.0109058696 
##           Car financing             Home buying                Business 
##            0.0086127971            0.0085386220            0.0082419213 
##                Vacation   Moving and relocation              Green loan 
##            0.0071038622            0.0053851175            0.0004980333
# Raw counts per loan title, sorted descending.
sort(table(df_clean_2019$title), TRUE)
## 
##      Debt consolidation Credit card refinancing        Home improvement 
##                  256685                  128504                   28485 
##                   Other          Major purchase        Medical expenses 
##                   26514                    8412                    5146 
##           Car financing             Home buying                Business 
##                    4064                    4029                    3889 
##                Vacation   Moving and relocation              Green loan 
##                    3352                    2541                     235

Turning into dummies

# One-hot encode the loan title/purpose (no intercept column).
df_clean_2019$title <- factor(df_clean_2019$title)
dummies <- model.matrix(~ title - 1, data = df_clean_2019)
df_clean_2019 <- cbind(df_clean_2019, as.data.frame(dummies))

# Drop the "Learning and training" and "Green loan" indicators: too few
# observations to be informative.
drop_titles <- c("titleLearning and training", "titleGreen loan")
df_clean_2019 <- df_clean_2019[, !(names(df_clean_2019) %in% drop_titles)]

Charts and NA/outliers removal

Loan Amount

# Histogram of the loan amount.
ggplot(df_clean_2019, aes(x = loan_amnt)) +
  geom_histogram(bins = 50, fill = "blue", color = "black") +
  labs(x = "Loan Amount", y = "Frequency", title = "Loan Amount Distribution")

Annual Income

# Full-range boxplot: extreme values sit only in the upper tail.
boxplot(df_clean_2019$annual_inc)

# Zoomed view, incomes below 500k only, to see the bulk of the distribution.
boxplot(df_clean_2019[df_clean_2019$annual_inc < 500000,]$annual_inc)

Outliers removal Income

As per 2 charts above, there are outliers only on top of the distribution, so delete top 1%

# Trim the top 1% of annual incomes (outliers are only in the upper tail).
# FIX: na.rm = TRUE keeps quantile() from erroring if annual_inc ever
# contains NAs, matching the dti cleanup below which already uses it.
upper_bound <- quantile(df_clean_2019$annual_inc, 0.99, na.rm = TRUE)
len_before <- nrow(df_clean_2019)

df_clean_2019 <- df_clean_2019[df_clean_2019$annual_inc < upper_bound, ]

len_after <- nrow(df_clean_2019)

Absolute number of deleted rows:

# Rows removed by the income trim.
len_before - len_after
## [1] 4719
# Distribution of annual income after outlier removal.
ggplot(df_clean_2019, aes(x = annual_inc)) +
  geom_histogram(bins = 50, fill = "blue", color = "black") +
  labs(title = "Annual Income Distribution", x = "Annual Income", y = "Frequency")

DTI

# Full-range DTI boxplot: extreme outliers in the upper tail.
boxplot(df_clean_2019$dti)

# Zoomed view, DTI below 200 only.
boxplot(df_clean_2019[df_clean_2019$dti < 200,]$dti)

Remove only top 1% and NAs

Number of NAs:

# Count the missing DTI values, then drop those rows and trim the top 1%.
sum(is.na(df_clean_2019$dti))
## [1] 141
df_clean_2019 <- df_clean_2019[!is.na(df_clean_2019$dti), ]

upper_bound <- quantile(df_clean_2019$dti, 0.99, na.rm = TRUE)
len_before <- nrow(df_clean_2019)

df_clean_2019 <- df_clean_2019[df_clean_2019$dti < upper_bound, ]

len_after <- nrow(df_clean_2019)

Number of removed rows:

# Rows removed by the DTI trim.
len_before - len_after
## [1] 4671
# Distribution of DTI after NA removal and trimming.
ggplot(df_clean_2019, aes(x = dti)) +
  geom_histogram(bins = 50, fill = "blue", color = "black") +
  labs(title = "Debt-To-Income Ratio Distribution", x = "DTI Ratio", y = "Frequency")

FICO score

# FICO score has no missing values; plot its distribution.
sum(is.na(df_clean_2019$fico_score))
## [1] 0
ggplot(df_clean_2019, aes(x = fico_score)) +
  geom_histogram(bins = 38, fill = "blue", color = "black") +
  labs(x = "FICO score", y = "Frequency", title = "FICO Score Distribution")

Correlations

# Pairwise correlations over the numeric columns, complete cases only.
cols_numeric <- sapply(df_clean_2019, is.numeric)
cor_matrix <- cor(df_clean_2019[, cols_numeric], use = "complete.obs")

High correlation with Loan Status

# Correlation of every numeric feature with the target; keep |r| > 0.05.
cor_vec <- cor_matrix["loan_status_num", ]
cor_vec <- cor_vec[!is.na(cor_vec)]
selected <- cor_vec[abs(cor_vec) > 0.05]

sort(selected, decreasing = TRUE)
## loan_status_num      fico_score  term 36 months  term 60 months       loan_amnt 
##      1.00000000      0.06675817      0.05707753     -0.05707753     -0.06667224
#col_range <- colorRampPalette(c("blue", "white", "red"))(200)

#corrplot(cor_matrix, method = "color", type="lower", diag = FALSE,
#         tl.cex=0.7,col = col_range, tl.col = "black")

Leaving only good and bad loans

# Keep only resolved loans: repaid (1) or delinquent/defaulted (-1).
df_clean_2019 <- subset(df_clean_2019, loan_status_num %in% c(1, -1))

# Sanitise column names (spaces, "<", etc.) so formula interfaces work.
names(df_clean_2019) <- make.names(names(df_clean_2019))

Balancing dataset by undersampling +1

# Class balance before undersampling.
table(df_clean_2019$loan_status_num)
## 
##    -1     1 
## 17799 55548
# Undersample the majority class so both outcomes have equal counts.
# FIX: ungroup() is added so df_balanced is not left as a grouped tibble,
# which would silently affect later subsetting/sampling operations.
df_balanced <- df_clean_2019 %>%
  group_by(loan_status_num) %>%
  #sample_n(5000)
  sample_n(min(table(df_clean_2019$loan_status_num))) %>%
  ungroup()
table(df_balanced$loan_status_num)
## 
##    -1     1 
## 17799 17799

Analysis of continuous variables

# Pairwise relationships between the continuous predictors and the target,
# computed on a random 5,000-row subsample to keep the chart fast.
cols_analysis <- c('annual_inc', 'loan_status_num', 'dti', 'fico_score',
                   'loan_amnt', 'emp_length_num'
                   )

sample_idx <- sample(nrow(df_balanced), 5000)
analysis_sample <- df_balanced[sample_idx, cols_analysis]
chart.Correlation(analysis_sample, histogram = TRUE, pch = 19)

Supervised learning

Random forest

Parameters tuning random forest

Using only subset of 5K lines for performance

# Hyper-parameter tuning is done on a 5K subsample for speed.
# FIX: set.seed() is moved BEFORE the subsample draw so the whole tuning
# pipeline (sample + splits) is reproducible; previously the seed was set
# only after sampling, leaving the subsample unseeded.
set.seed(123)
sample_idx <- sample(nrow(df_balanced), 5000)
finetuning_sample <- df_balanced[sample_idx, sapply(df_balanced, is.numeric)]

# Train/Validation/Test split: 70% train+valid, 30% test, stratified on the
# outcome. (finetuning_sample is already numeric-only, so no re-selection.)
train_index <- createDataPartition(finetuning_sample$loan_status_num, p = 0.7, list = FALSE)
train_valid <- finetuning_sample[train_index, ]
test        <- finetuning_sample[-train_index, ]

# Split train_valid into train (80%) and validation (20%).
train_index2 <- createDataPartition(train_valid$loan_status_num, p = 0.8, list = FALSE)
train <- train_valid[train_index2, ]
valid <- train_valid[-train_index2, ]

# Model the outcome as a two-level factor: -1 = bad loan, 1 = repaid.
train$loan_status_num <- factor(train$loan_status_num, levels = c(-1, 1))
valid$loan_status_num <- factor(valid$loan_status_num, levels = c(-1, 1))
test$loan_status_num  <- factor(test$loan_status_num,  levels = c(-1, 1))


# Grid of parameters
# Grid of parameters for the random-forest search.
param_grid <- expand.grid(
  ntree   = c(100, 200, 300),
  mtry    = c(3, 5, 7, floor(sqrt(ncol(train) - 1))),
  nodesize = c(1, 5, 10)
)

best_f1 <- 0
best_params <- NULL

# Loop over parameter combinations, keeping the one with the best
# validation-set F1.
for (i in seq_len(nrow(param_grid))) {
  params <- param_grid[i, ]

  rf_model <- randomForest(
    loan_status_num ~ .,
    data = train,
    ntree = params$ntree,
    mtry = params$mtry,
    nodesize = params$nodesize
  )

  pred_valid <- predict(rf_model, newdata = valid)

  cm <- confusionMatrix(pred_valid, valid$loan_status_num, positive = "1")
  f1 <- cm$byClass["F1"]

  # FIX: F1 is NA when a model never predicts the positive class; guard
  # against NA so the comparison does not error out mid-search.
  if (!is.na(f1) && f1 > best_f1) {
    best_f1 <- f1
    best_params <- params
  }
}


# Retrain with best params on train+valid
rf_final <- randomForest(
  loan_status_num ~ .,
  data = rbind(train, valid),
  ntree = best_params$ntree,
  mtry = best_params$mtry,
  nodesize = best_params$nodesize,
  importance = TRUE
)

# Test evaluation
pred_test <- predict(rf_final, newdata = test)
cm_test <- confusionMatrix(pred_test, test$loan_status_num, positive = "1")

# Held-out metrics of the tuned model; "1" (repaid) is the positive class.
precision <- cm_test$byClass["Precision"]
recall    <- cm_test$byClass["Recall"]
f1        <- cm_test$byClass["F1"]

precision; recall; f1
## Precision 
## 0.6383866
##    Recall 
## 0.6304945
##       F1 
## 0.634416
best_params

Train using all the data

# Final model: 70/30 stratified train-test split on the full balanced data,
# using the tuned hyper-parameters found above (ntree = 300, mtry = 8,
# nodesize = 5).
set.seed(123)
train_index <- createDataPartition(df_balanced$loan_status_num, p = 0.7, list = FALSE)

numeric_cols <- sapply(df_balanced, is.numeric)
train <- df_balanced[train_index, numeric_cols]
test  <- df_balanced[-train_index, numeric_cols]

# Two-level factor outcome: -1 = bad loan, 1 = repaid.
train$loan_status_num <- factor(train$loan_status_num, levels = c(-1, 1))
test$loan_status_num <- factor(test$loan_status_num, levels = c(-1, 1))

rf_model <- randomForest(
  loan_status_num ~ .,
  data = train,
  ntree = 300,
  mtry = 8,
  nodesize = 5,
  importance = TRUE
)

# Out-of-bag error summary of the full-data model.
print(rf_model)
## 
## Call:
##  randomForest(formula = loan_status_num ~ ., data = train, ntree = 300,      mtry = 8, nodesize = 5, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 300
## No. of variables tried at each split: 8
## 
##         OOB estimate of  error rate: 37.43%
## Confusion matrix:
##      -1    1 class.error
## -1 8448 4012   0.3219904
## 1  5316 7144   0.4266453
# Evaluate the full-data model on the held-out test set.
pred <- predict(rf_model, newdata = test)

# Confusion matrix; "1" (fully repaid) is the positive class.
cm <- confusionMatrix(factor(pred), factor(test$loan_status_num), positive = "1")

precision <- cm$byClass["Precision"]
recall    <- cm$byClass["Recall"]
f1        <- cm$byClass["F1"]

precision; recall; f1
## Precision 
## 0.6446764
##    Recall 
## 0.5783855
##        F1 
## 0.6097344

Confusion matrix

# Heatmap rendering of the confusion matrix.
cm_table <- as.data.frame(cm$table)

ggplot(cm_table, aes(x = Prediction, y = Reference, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Freq), color = "white", size = 6, vjust = 1.5) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(x = "Predicted", y = "Actual", title = "Confusion Matrix") +
  theme_minimal(base_size = 14)

# NOTE(review): this inspects rf_final — the tuning-stage model fitted on the
# 5K subsample — not rf_model, which was just trained on all the data above.
# Confirm which model's variable importances are intended here.
importance(rf_final)
##                                             -1           1 MeanDecreaseAccuracy
## loan_amnt                           8.29101785  9.15621885         13.742330764
## annual_inc                          3.95380145  0.67504967          3.687587825
## dti                                 4.83882772  3.58066270          5.723966591
## fico_score                         16.11776446  9.16560361         17.995202313
## emp_length_num                      5.15320524  2.98340488          6.118892601
## home_ownershipMORTGAGE             -0.59375188  8.67881029          8.476456152
## home_ownershipOWN                   0.04283700  2.31127422          1.749926493
## home_ownershipRENT                  0.62263873  6.03259259          5.994984476
## term.36.months                      5.65651172  1.50784366          5.262982155
## term.60.months                      6.74561720  0.05659777          5.024906966
## verification_statusNot.Verified    -0.90973875  1.25520767          0.415582406
## verification_statusSource.Verified -0.63052585  1.54776044          0.625837088
## verification_statusVerified         2.42091882 -0.79788160          1.299834356
## addr_stateAK                        0.39606317 -0.92316789         -0.459761941
## addr_stateAL                        3.37411166  2.21049974          3.972506987
## addr_stateAR                       -0.52711788 -1.40012063         -1.417148891
## addr_stateAZ                       -2.34300497  1.34951789         -0.301010016
## addr_stateCA                        0.66012100 -0.23464225          0.388673619
## addr_stateCO                       -2.19378892  1.57317803         -0.274866099
## addr_stateCT                        2.97395580 -1.16808804          1.275795303
## addr_stateDC                       -1.35063815 -0.71617193         -1.398285106
## addr_stateDE                        0.48089726  0.44433195          0.670091780
## addr_stateFL                        1.21035388 -1.78887208         -0.505855676
## addr_stateGA                        4.05959606  1.42371138          3.898927236
## addr_stateHI                       -1.17565081  3.20648856          1.309027867
## addr_stateID                        1.52920212  0.44195763          1.415237287
## addr_stateIL                        1.34139422 -3.24247757         -1.222563179
## addr_stateIN                        5.03155204 -0.75053824          3.201914123
## addr_stateKS                       -2.91576612 -0.52919664         -2.345222347
## addr_stateKY                        2.23920888 -1.83993949          0.576464908
## addr_stateLA                       -0.21545225 -0.97500490         -0.848973026
## addr_stateMA                       -3.00895984 -0.73663274         -2.633522339
## addr_stateMD                       -0.71894780 -1.28949161         -1.453702005
## addr_stateME                        2.11988862  3.76816777          4.215005885
## addr_stateMI                       -1.03227739 -1.62747391         -1.866351360
## addr_stateMN                        1.09000186 -0.59074668          0.372756755
## addr_stateMO                        1.62740907 -4.44237410         -1.904696627
## addr_stateMS                        0.24644916  3.32120676          2.673742820
## addr_stateMT                        0.66146695 -1.37271926         -0.491689017
## addr_stateNC                       -2.35095837 -2.49874316         -3.491438757
## addr_stateND                       -1.53151208 -1.81596578         -2.301164898
## addr_stateNE                        1.90407976  1.66820076          2.338437440
## addr_stateNH                        1.65759506 -0.18613992          1.162169040
## addr_stateNJ                       -0.68534390 -2.59714316         -2.490974612
## addr_stateNM                       -1.47131227 -1.24249534         -1.937865664
## addr_stateNV                       -0.21339571 -1.73050942         -1.379349994
## addr_stateNY                        0.06291361 -1.69438697         -1.187413003
## addr_stateOH                        1.64214073  3.23048692          3.475510376
## addr_stateOK                        0.85280236 -1.10235891         -0.001952085
## addr_stateOR                        1.10790479  1.38364793          1.809467457
## addr_statePA                       -1.49814501  0.51056906         -0.760082448
## addr_stateRI                        1.88197218  1.64787284          2.488562352
## addr_stateSC                        0.10325285  3.45328602          2.622884294
## addr_stateSD                       -1.65484042  0.03900032         -1.647096243
## addr_stateTN                       -1.51296139 -3.30042110         -3.326739531
## addr_stateTX                        2.13579366  0.07401098          1.406763165
## addr_stateUT                       -0.50916632  0.28136950         -0.151330943
## addr_stateVA                       -2.30776219 -1.19971882         -2.755581369
## addr_stateVT                       -0.77084927 -0.42607578         -0.763798166
## addr_stateWA                        0.26997534  1.67608954          1.349443593
## addr_stateWI                        3.82051711  1.99691636          3.880073118
## addr_stateWV                       -1.56072383 -0.73502415         -1.538595827
## addr_stateWY                        0.81494045 -1.36614597         -0.330227000
## titleBusiness                       1.11864028 -1.73783939         -0.412674903
## titleCar.financing                 -0.82354576 -1.78819584         -1.943322528
## titleCredit.card.refinancing       -0.33033525 -0.77473749         -0.767799545
## titleDebt.consolidation            -1.77387592 -0.84295640         -2.132521446
## titleHome.buying                    2.12032633 -1.05580849          1.003457374
## titleHome.improvement               3.14778672 -2.64622385          0.132043232
## titleMajor.purchase                 5.12506238 -0.34537053          3.113405774
## titleMedical.expenses               5.38592664 -1.44464376          2.614195337
## titleMoving.and.relocation         -1.22311160 -0.49573910         -1.355494939
## titleOther                          1.28128941 -0.84671028          0.219322437
## titleVacation                      -1.19589292 -1.99182281         -2.212047399
##                                    MeanDecreaseGini
## loan_amnt                               146.3833392
## annual_inc                              136.0549752
## dti                                     155.6236462
## fico_score                              146.9553434
## emp_length_num                           83.1044750
## home_ownershipMORTGAGE                   24.1386700
## home_ownershipOWN                        11.9550033
## home_ownershipRENT                       19.6257454
## term.36.months                           16.9355905
## term.60.months                           16.1341952
## verification_statusNot.Verified          19.2747111
## verification_statusSource.Verified       19.4349010
## verification_statusVerified              16.8638544
## addr_stateAK                              0.9647126
## addr_stateAL                              5.8823276
## addr_stateAR                              3.7687084
## addr_stateAZ                              7.8349027
## addr_stateCA                             18.6943540
## addr_stateCO                              7.5399284
## addr_stateCT                              8.0809310
## addr_stateDC                              1.2673111
## addr_stateDE                              2.1676142
## addr_stateFL                             15.6394310
## addr_stateGA                             10.6034393
## addr_stateHI                              3.0612575
## addr_stateID                              2.3703887
## addr_stateIL                             10.4424349
## addr_stateIN                              8.2274624
## addr_stateKS                              2.9969907
## addr_stateKY                              4.7590798
## addr_stateLA                              4.2082157
## addr_stateMA                              8.9919194
## addr_stateMD                              6.6298084
## addr_stateME                              1.6782767
## addr_stateMI                              8.3023207
## addr_stateMN                              6.0637229
## addr_stateMO                              6.2347063
## addr_stateMS                              3.7768855
## addr_stateMT                              2.5219835
## addr_stateNC                              8.6510352
## addr_stateND                              1.3565198
## addr_stateNE                              2.3093275
## addr_stateNH                              2.6550961
## addr_stateNJ                              8.9982381
## addr_stateNM                              2.7137085
## addr_stateNV                              5.5865466
## addr_stateNY                             14.0244071
## addr_stateOH                             10.3167857
## addr_stateOK                              4.5012326
## addr_stateOR                              7.6884669
## addr_statePA                              7.7368120
## addr_stateRI                              2.1944698
## addr_stateSC                              5.2399574
## addr_stateSD                              0.7640156
## addr_stateTN                              7.0572175
## addr_stateTX                             16.2925412
## addr_stateUT                              3.3538298
## addr_stateVA                              8.2990084
## addr_stateVT                              0.9533942
## addr_stateWA                              9.4474094
## addr_stateWI                              7.4966424
## addr_stateWV                              2.4244645
## addr_stateWY                              0.9959886
## titleBusiness                             3.0182945
## titleCar.financing                        3.6006825
## titleCredit.card.refinancing             17.0846978
## titleDebt.consolidation                  19.6442365
## titleHome.buying                          3.0181866
## titleHome.improvement                    10.6594335
## titleMajor.purchase                       7.4402818
## titleMedical.expenses                     5.1860533
## titleMoving.and.relocation                1.8111881
## titleOther                               11.4748597
## titleVacation                             2.6891711
# NOTE(review): plots importances of rf_final (the 5K tuning model); rf_model
# (the full-data fit) may be the intended object — verify.
varImpPlot(rf_final, n.var = 10)

Logistic regression

# Baseline predictor set for the logistic regression; loan_status_num is the
# response and is listed so the %in% subset below keeps it in the model frame.
# (Idiom fix: use <- rather than = for top-level assignment.)
basic_vars <- c("loan_amnt", "annual_inc", "dti",
                "loan_status_num", "fico_score",
                "emp_length_num", "term.60.months",
                "home_ownershipRENT", "home_ownershipMORTGAGE",
                "verification_statusVerified", "verification_statusSource.Verified")


# Fit logistic regression
# NOTE(review): binomial glm() requires a 0/1 (or factor) response, yet the
# test labels below are coded -1/1 -- confirm how train$loan_status_num is
# coded upstream.
logit_model <- glm(loan_status_num ~ ., data = train[(colnames(train) %in% basic_vars)], family = binomial)

# Predict probabilities
pred_prob <- predict(logit_model, newdata = test[(colnames(test) %in% basic_vars)], type = "response")

# Convert probabilities to class (threshold 0.5)
pred_class <- ifelse(pred_prob > 0.5, 1, -1)


# Confusion matrix (caret); class "1" is treated as the positive class, and
# both vectors are forced to the same factor levels so the table aligns.
cm <- confusionMatrix(factor(pred_class, levels=c(-1,1)),
                      factor(test$loan_status_num, levels=c(-1,1)),
                      positive = "1")

print(cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   -1    1
##         -1 3483 2106
##         1  1856 3233
##                                           
##                Accuracy : 0.629           
##                  95% CI : (0.6197, 0.6381)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.2579          
##                                           
##  Mcnemar's Test P-Value : 7.626e-05       
##                                           
##             Sensitivity : 0.6055          
##             Specificity : 0.6524          
##          Pos Pred Value : 0.6353          
##          Neg Pred Value : 0.6232          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3028          
##    Detection Prevalence : 0.4766          
##       Balanced Accuracy : 0.6290          
##                                           
##        'Positive' Class : 1               
## 
# Pull precision / recall / F1 for the positive class from caret's byClass
# vector; single-bracket indexing keeps the names for the printed output.
precision <- cm$byClass["Precision"]
recall    <- cm$byClass["Recall"]
f1        <- cm$byClass["F1"]

precision; recall; f1
## Precision 
## 0.6352918
##    Recall 
## 0.6055441
##        F1 
## 0.6200614
# Heatmap view of the confusion matrix: darker tiles = more observations.
cm_freq <- as.data.frame(cm$table)

ggplot(cm_freq, aes(x = Prediction, y = Reference, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Freq), color = "white", size = 6, vjust = 1.5) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  theme_minimal(base_size = 14) +
  labs(title = "Confusion Matrix", x = "Predicted", y = "Actual")

# Coefficient table: per the output below, every predictor except
# verification_statusSource.Verified is significant at the 0.1% level.
summary(logit_model)
## 
## Call:
## glm(formula = loan_status_num ~ ., family = binomial, data = train[(colnames(train) %in% 
##     basic_vars)])
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        -7.036e+00  2.888e-01 -24.366  < 2e-16 ***
## loan_amnt                          -2.795e-05  1.573e-06 -17.763  < 2e-16 ***
## annual_inc                          1.154e-06  3.292e-07   3.506 0.000455 ***
## dti                                -1.260e-02  1.412e-03  -8.922  < 2e-16 ***
## fico_score                          1.067e-02  4.007e-04  26.628  < 2e-16 ***
## emp_length_num                      3.007e-02  3.617e-03   8.313  < 2e-16 ***
## home_ownershipMORTGAGE              3.651e-01  4.253e-02   8.585  < 2e-16 ***
## home_ownershipRENT                 -2.103e-01  4.326e-02  -4.861 1.17e-06 ***
## term.60.months                     -4.297e-01  3.170e-02 -13.554  < 2e-16 ***
## verification_statusSource.Verified -1.677e-02  2.909e-02  -0.576 0.564307    
## verification_statusVerified        -2.360e-01  4.048e-02  -5.829 5.56e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 34546  on 24919  degrees of freedom
## Residual deviance: 32354  on 24909  degrees of freedom
## AIC: 32376
## 
## Number of Fisher Scoring iterations: 4
# Exponentiate the log-odds coefficients so they read as odds ratios.
exp(coef(logit_model))
##                        (Intercept)                          loan_amnt 
##                       0.0008793785                       0.9999720525 
##                         annual_inc                                dti 
##                       1.0000011540                       0.9874773055 
##                         fico_score                     emp_length_num 
##                       1.0107270670                       1.0305228183 
##             home_ownershipMORTGAGE                 home_ownershipRENT 
##                       1.4406485343                       0.8103545336 
##                     term.60.months verification_statusSource.Verified 
##                       0.6507274910                       0.9833698193 
##        verification_statusVerified 
##                       0.7898077185

Interpretation: each additional year of employment raises the odds of repayment by about 3% (OR ≈ 1.03), i.e. lowers default risk; each additional DTI point lowers the odds of repayment by about 1.3% (OR ≈ 0.99), i.e. raises default risk.

Unsupervised learning

# Unsupervised exploration: PCA on a 10k-row sample of the balanced data.
set.seed(123)
sample_idx <- sample(seq_len(nrow(df_balanced)), 10000)
num_cols <- df_balanced[sample_idx, ]

# Numeric features to project. NOTE(review): loan_status_num (the target) is
# kept so it can color the biplot below, but it also enters the PCA itself --
# confirm that is intended.
x <- c("loan_amnt", "annual_inc", "dti",
       "loan_status_num", "fico_score",
       "emp_length_num")

num_cols <- num_cols[(colnames(num_cols) %in% x)]

# Drop incomplete rows -- prcomp() cannot handle NAs; this also mirrors the
# na.omit() step used in the k-means preparation below.
num_cols <- na.omit(num_cols)

# Remove columns with zero variance (scaling would divide by zero)
num_cols <- num_cols[, apply(num_cols, 2, var) != 0]

pca_res <- prcomp(num_cols, scale. = TRUE)
summary(pca_res)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6
## Standard deviation     1.1932 1.1187 1.0513 0.9659 0.8710 0.72669
## Proportion of Variance 0.2373 0.2086 0.1842 0.1555 0.1264 0.08801
## Cumulative Proportion  0.2373 0.4459 0.6300 0.7856 0.9120 1.00000
# Score plot: observations projected onto the first two principal components.
pca_df <- as.data.frame(pca_res$x[, 1:2])
ggplot(pca_df, aes(x = PC1, y = PC2)) +
  geom_point(alpha = 0.6) +
  theme_minimal()

Loadings visualisation

# Attach the loan outcome (from the sampled rows) for coloring the biplot.
pca_df$loan_status <- factor(num_cols$loan_status_num)

# Loadings
loadings <- as.data.frame(pca_res$rotation[,1:2])
loadings$varname <- rownames(loadings)
loadings[,1:2] <- loadings[,1:2] * 5     # scale arrows so they're visible

# Percent of variance explained by PC1/PC2, used in the axis labels below.
explained_var <- round(100 * summary(pca_res)$importance[2, 1:2], 1)

# Plot scores + arrows
# Biplot: points are sampled borrowers (colored by outcome); arrows are the
# (scaled) variable loadings showing how each feature aligns with PC1/PC2.
ggplot(pca_df, aes(PC1, PC2, color=loan_status)) +
  geom_point(size=0.7, alpha=0.6) +
  geom_segment(data=loadings, aes(x=0, y=0, xend=PC1, yend=PC2),
               arrow=arrow(length=unit(0.2,"cm")), color="black") +
  geom_text(data=loadings, aes(x=PC1, y=PC2, label=varname),
            color="black", vjust=1.5, size=3) +
  theme_minimal() +
  labs(
  title="PCA Biplot",
    color="Loan Status",
    x = paste0("PC1 (", explained_var[1], "%)"),
    y = paste0("PC2 (", explained_var[2], "%)")
  )

# Numeric loadings table (note: values were multiplied by 5 above for display).
loadings

Clustering

K means

Choosing K

# Prepare a 10k-row sample for choosing k.
set.seed(123)
sample_idx <- sample(seq_len(nrow(df_balanced)), 10000)
num_cols_kmeans <- df_balanced[sample_idx, ]

# Features for segmentation; the target is listed only so it can be dropped
# explicitly below.
x <- c("loan_amnt", "annual_inc", "dti",
      "loan_status_num", "fico_score",
      "emp_length_num")

num_cols_kmeans <- num_cols_kmeans[(colnames(num_cols_kmeans) %in% x)]

# Remove NAs
num_cols_kmeans <- na.omit(num_cols_kmeans)

# Scale numeric features.
# BUG FIX: df_balanced is grouped by loan_status_num, so select(-loan_status_num)
# silently re-added the grouping column (the "Adding missing grouping variables"
# warning) and the target leaked into the clustering features. ungroup() first
# so the label is genuinely excluded.
X <- scale(num_cols_kmeans %>% ungroup() %>% select(-loan_status_num))
set.seed(123)
# Elbow method: total within-cluster sum of squares for k = 1..10.
# iter.max raised from the default 10 because runs warned
# "did not converge in 10 iterations"; vapply() (unlike sapply()) guarantees
# a numeric vector result.
wcss <- vapply(1:10, function(k) {
  kmeans(X, centers = k, nstart = 25, iter.max = 50)$tot.withinss
}, numeric(1))
# Elbow plot: look for the bend where adding clusters stops paying off.
plot(x = 1:10, y = wcss, type = "b", pch = 19,
     main = "Elbow Method",
     xlab = "Number of clusters (k)",
     ylab = "Within-cluster sum of squares")

set.seed(123)
# Average silhouette width for k = 2..10. silhouette() comes from the
# 'cluster' package -- presumably loaded earlier in the document; confirm.
# Hoist the O(n^2) distance matrix out of the loop instead of recomputing
# dist(X) for every k; iter.max raised because kmeans warned
# "did not converge in 10 iterations".
d <- dist(X)
sil_width <- vapply(2:10, function(k) {
  km <- kmeans(X, centers = k, nstart = 25, iter.max = 50)
  mean(silhouette(km$cluster, d)[, 3])
}, numeric(1))
# Silhouette plot: the best k maximizes the average silhouette width.
plot(x = 2:10, y = sil_width, type = "b", pch = 19,
     main = "Silhouette Method",
     xlab = "Number of clusters (k)",
     ylab = "Average silhouette width")

From above, K = 3

Results

# Reproducibility fix: the original drew this sample without re-seeding,
# so the 35k-row draw was not reproducible across runs.
set.seed(123)
sample_idx <- sample(seq_len(nrow(df_balanced)), 35000)
df_seg <- df_balanced[sample_idx, ]

x <- c("loan_amnt", "annual_inc", "dti",
      "loan_status_num", "fico_score",
      "emp_length_num")

df_seg <- df_seg[(colnames(df_seg) %in% x)]

df_seg <- na.omit(df_seg)

# BUG FIX: ungroup() before select(); df_balanced is grouped by
# loan_status_num, so select(-loan_status_num) silently re-added it
# ("Adding missing grouping variables" warning) and the target leaked into
# the clustering features -- which is why the cluster default rates below
# are ~1.000 and ~0.000.
X <- scale(df_seg %>% ungroup() %>% select(-loan_status_num))
set.seed(123)
# Final segmentation with k = 3 (chosen from the elbow/silhouette plots).
km <- kmeans(X, centers = 3, nstart = 25)

# Add cluster labels
df_seg$cluster <- factor(km$cluster)

# Visualize
# Project the scaled feature matrix onto its first two PCs so the three
# clusters can be drawn in 2D; carry over cluster and outcome labels.
pca <- prcomp(X)
pca_df <- as.data.frame(pca$x[,1:2])
pca_df$cluster <- df_seg$cluster
pca_df$loan_status <- df_seg$loan_status_num

# 2-D view of the three k-means segments in PCA space.
ggplot(pca_df, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(size = 0.7, alpha = 0.6) +
  theme_minimal() +
  labs(title = "Borrower Segmentation via K-means", color = "Cluster")

#ggplot(pca_df, aes(PC1, PC2, color=factor(loan_status), shape=cluster)) +
#  geom_point(alpha=0.6, size=0.7) +
#  labs(title="Borrower Segmentation via K-means", color="Loan Status", shape="Cluster") +
#  theme_minimal()
# Variance explained by each principal component of the scaled features.
summary(pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4    PC5     PC6
## Standard deviation     1.2003 1.1149 1.0443 0.9666 0.8744 0.72593
## Proportion of Variance 0.2401 0.2072 0.1818 0.1557 0.1274 0.08783
## Cumulative Proportion  0.2401 0.4473 0.6290 0.7847 0.9122 1.00000
# Summarize clusters
# Per-cluster profile: size, mean feature values, and default rate
# (loan_status_num == -1 encodes default here).
cluster_summary <- df_seg %>%
  group_by(cluster) %>%
  summarise(
    n = n(),
    avg_loan = mean(loan_amnt, na.rm=TRUE),
    avg_income = mean(annual_inc, na.rm=TRUE),
    avg_dti = mean(dti, na.rm=TRUE),
    avg_fico = mean(fico_score, na.rm=TRUE),
    avg_emp_length = mean(emp_length_num, na.rm=TRUE),
    default_rate = mean(loan_status_num == -1, na.rm=TRUE)
  )

print(cluster_summary)
## # A tibble: 3 × 8
##   cluster     n avg_loan avg_income avg_dti avg_fico avg_emp_length default_rate
##   <fct>   <int>    <dbl>      <dbl>   <dbl>    <dbl>          <dbl>        <dbl>
## 1 1        7136   28773.    142662.    15.7     718.           5.92     0.556   
## 2 2       13542   13853.     65780.    21.3     697.           5.05     1.000   
## 3 3       14322   11417.     70966.    18.7     712.           5.67     0.000419